None Ch4
In [ ]:
# <h1>Table of Contents<span class="tocSkip"></span></h1>
# <div class="toc"><ul class="toc-item"><li><span><a href="#Import-dep" data-toc-modified-id="Import-dep-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Import dep</a></span></li><li><span><a href="#Example-4.1" data-toc-modified-id="Example-4.1-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Example 4.1</a></span><ul class="toc-item"><li><span><a href="#Ex-4.1" data-toc-modified-id="Ex-4.1-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Ex 4.1</a></span></li><li><span><a href="#Ex-4.2" data-toc-modified-id="Ex-4.2-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Ex 4.2</a></span></li><li><span><a href="#Ex-4.4" data-toc-modified-id="Ex-4.4-2.3"><span class="toc-item-num">2.3&nbsp;&nbsp;</span>Ex 4.4</a></span></li></ul></li><li><span><a href="#Ex-4.5" data-toc-modified-id="Ex-4.5-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Ex 4.5</a></span></li><li><span><a href="#Ex-4.7" data-toc-modified-id="Ex-4.7-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Ex 4.7</a></span><ul class="toc-item"><li><span><a href="#Replicate-Fig.-4.2" data-toc-modified-id="Replicate-Fig.-4.2-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>Replicate Fig. 4.2</a></span></li><li><span><a href="#Add-more-details" data-toc-modified-id="Add-more-details-4.2"><span class="toc-item-num">4.2&nbsp;&nbsp;</span>Add more details</a></span></li></ul></li></ul></div>
In [1]:
from IPython.display import display, HTML

# Set the notebook width to 80%
display(HTML("<style>.container { width: 80% !important; }</style>"))
In [2]:
!jupyter notebook list
Currently running servers:
http://localhost:3710/ :: /home/kzy816
In [3]:
# Needs to paste `http://localhost:3110`, no ending `/`
port = 3710

import IPython
import json
import requests

hostname = !hostname

# Get the current Jupyter server's info
result = !jupyter notebook list
for res in result:
    if f'http://localhost:{port}/' in res:
        result = res.split(' :: ')[0]
        break

# Print the server URL
print(f'Current Jupyter server {hostname} URL: {result}')

# Get the list of running notebooks
response = requests.get(f'{result}api/sessions')

# # Convert the JSON data to a string and print it
# print(json.dumps(response.json(), indent=4))

nbs = response.json()
nb_names = [nb['name'] for nb in nbs]
print(len(nb_names), nb_names)
Current Jupyter server ['qnode9024'] URL: http://localhost:3710/
1 ['Ch4_1-182269aa-ad3d-4bd8-ba86-c41c28623830.ipynb']

1. Import dep

In [4]:
from itertools import product

import numpy as np
import matplotlib.pyplot as plt
%matplotlib notebook
In [5]:
import plotly
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import plotly.io as pio

pio.renderers.default = "notebook"
In [6]:
COLOR_LIST = plotly.colors.DEFAULT_PLOTLY_COLORS
len(COLOR_LIST)
Out[6]:
10
In [7]:
print(plotly.__version__, plotly.__path__)
5.22.0 ['/home/kzy816/.conda/envs/p311/lib/python3.11/site-packages/plotly']

2. Example 4.1

In [8]:
def state_val_update1(vnn, ga):
    nr, nc = vnn.shape
    # Using the vnn values below to use the newly obtained v_{n+1} values.
    for i, j in product(range(nr), range(nc)):
        if (i==0 and j==0) or (i==nr-1 and j==nc-1):
            vnn[i, j] = 0
        else:
            # Corners
            if i==0 and j==nc-1:
                vnn[i, j] = -1+sum([ga*vnn[i, j-1], ga*vnn[i, j], ga*vnn[i, j], ga*vnn[i+1, j]])/4
            elif i==nr-1 and j==0:
                vnn[i, j] = -1+sum([ga*vnn[i, j], ga*vnn[i, j+1], ga*vnn[i-1, j], ga*vnn[i, j]])/4
            # Boundaries
            elif i==0:
                vnn[i, j] = -1+sum([ga*vnn[i, j-1], ga*vnn[i, j+1], ga*vnn[i, j], ga*vnn[i+1, j]])/4
            elif i==nr-1:
                vnn[i, j] = -1+sum([ga*vnn[i, j-1], ga*vnn[i, j+1], ga*vnn[i-1, j], ga*vnn[i, j]])/4
            elif j==0:
                vnn[i, j] = -1+sum([ga*vnn[i, j], ga*vnn[i, j+1], ga*vnn[i-1, j], ga*vnn[i+1, j]])/4
            elif j==nc-1:
                vnn[i, j] = -1+sum([ga*vnn[i, j-1], ga*vnn[i, j], ga*vnn[i-1, j], ga*vnn[i+1, j]])/4
            # Inner
            else:
                vnn[i, j] = -1+sum([ga*vnn[i, j-1], ga*vnn[i, j+1], ga*vnn[i-1, j], ga*vnn[i+1, j]])/4


def eval_schema2(
    state_val_update_func,
    v0: np.ndarray, ga: float, max_iter: int, memo_step: int = 1, plt_trace=False, trace_ij=(0, 0)
):
    """
    When updating the value function at n+1, newly calcualted n+1 values are also used.
    RLAI book call this "in-place" algorithm.
    """
    vn = v0.copy()
    memo_v = {}
    for si in range(max_iter):
        vnn = vn.copy()
        
        state_val_update_func(vnn, ga)
            
        if si%memo_step==0:
            memo_v[si] = vn

        vn = vnn
    
    memo_v[max_iter] = vn
    
    if plt_trace:
        fig = make_subplots(rows=1, cols=1)
        arr_si = np.array(sorted(memo_v.keys()))
        vs = [memo_v[si][trace_ij] for si in arr_si]
        fig.add_trace(go.Scatter(x=arr_si+1, y=vs, mode='lines+markers'), row=1, col=1)
        fig.update_layout(
            xaxis=dict(title='Iteration', type='log'),
            title=f'Value Function at Iteration {trace_ij}', height=600, width=600)
        fig.show()
    
    return memo_v

def pr_val_func(val: np.ndarray, fmt='%8.3f', reshape=False, nr=None, nc=None):
    if reshape:
        print(np.array2string(val.reshape(nr, nc, order='C'), formatter={'float_kind':lambda x: fmt % x}))
    else:
        print(np.array2string(val, formatter={'float_kind':lambda x: fmt % x}))
        

2.1. Ex 4.1

In [ ]:
v0 = np.zeros((4, 4))
ga = 1.0
max_iter = 1000
memo_step = 10
memo_v = eval_schema2(state_val_update1, v0, ga, max_iter, memo_step, plt_trace=True, trace_ij=(1, 3))
pr_val_func(memo_v[max_iter])
[[   0.000  -14.000  -20.000  -22.000]
 [ -14.000  -18.000  -20.000  -20.000]
 [ -20.000  -20.000  -18.000  -14.000]
 [ -22.000  -20.000  -14.000    0.000]]
In [ ]:
q_11_down = -1+memo_v[max_iter][3, 3]
q_7_down = -1+memo_v[max_iter][2, 3]
q_11_down, q_7_down
Out[ ]:
(-1.0, -14.999999999999986)

2.2. Ex 4.2

In [ ]:
def state_val_update2(vnn, ga):
    def _ij(i, j):
        return i*4+j
    nr, nc = 4, 4
    # Using the vnn values below to use the newly obtained v_{n+1} values.
    for i, j in product(range(nr), range(nc)):
        if (i==0 and j==0) or (i==nr-1 and j==nc-1):
            vnn[_ij(i, j)] = 0
        else:
            # Corners
            if i==0 and j==nc-1:
                vnn[_ij(i, j)] = -1+sum([ga*vnn[_ij(i, j-1)], ga*vnn[_ij(i, j)], ga*vnn[_ij(i, j)], ga*vnn[_ij(i+1, j)]])/4
            elif i==nr-1 and j==0:
                vnn[_ij(i, j)] = -1+sum([ga*vnn[_ij(i, j)], ga*vnn[_ij(i, j+1)], ga*vnn[_ij(i-1, j)], ga*vnn[_ij(i, j)]])/4
            # Boundaries
            elif i==0:
                vnn[_ij(i, j)] = -1+sum([ga*vnn[_ij(i, j-1)], ga*vnn[_ij(i, j+1)], ga*vnn[_ij(i, j)], ga*vnn[_ij(i+1, j)]])/4
            elif i==nr-1:
                vnn[_ij(i, j)] = -1+sum([ga*vnn[_ij(i, j-1)], ga*vnn[_ij(i, j+1)], ga*vnn[_ij(i-1, j)], ga*vnn[_ij(i, j)]])/4
            elif j==0:
                vnn[_ij(i, j)] = -1+sum([ga*vnn[_ij(i, j)], ga*vnn[_ij(i, j+1)], ga*vnn[_ij(i-1, j)], ga*vnn[_ij(i+1, j)]])/4
            elif j==nc-1:
                vnn[_ij(i, j)] = -1+sum([ga*vnn[_ij(i, j-1)], ga*vnn[_ij(i, j)], ga*vnn[_ij(i-1, j)], ga*vnn[_ij(i+1, j)]])/4
            # Inner
            else:
                vnn[_ij(i, j)] = -1+sum([ga*vnn[_ij(i, j-1)], ga*vnn[_ij(i, j+1)], ga*vnn[_ij(i-1, j)], ga*vnn[_ij(i+1, j)]])/4
    vnn[-1] = -1+sum([ga*vnn[_ij(3, 0)], ga*vnn[_ij(3, 1)], ga*vnn[_ij(3, 2)], ga*vnn[-1]])/4
In [ ]:
v0 = np.zeros((17,))
ga = 1.0
max_iter = 1000
memo_step = 10
memo_v = eval_schema2(state_val_update2, v0, ga, max_iter, memo_step, plt_trace=True, trace_ij=-1)
pr_val_func(memo_v[max_iter][:16], reshape=True, nr=-1, nc=4)
memo_v[max_iter][-1]
[[   0.000  -14.000  -20.000  -22.000]
 [ -14.000  -18.000  -20.000  -20.000]
 [ -20.000  -20.000  -18.000  -14.000]
 [ -22.000  -20.000  -14.000    0.000]]
Out[ ]:
-19.99999999999998

2.3. Ex 4.4

  • The bug is that the optimal policy is not unique. So even though the policy improvement shows that the policy-stable=false, the algorithm may not give a better state-value function in the next policy evaluation. Actually, the algorithm can circulating between different optimal policies. (In other words, ties should be broken in a consistent order.)
    • One way to fix it is that in policy improvement, we don't use the current way to assign $\pi(s)$. Instead, we check if there is an action giving a better value of existing $p(s, a)$. If so, we assign the action to the $\pi(s)$ and assign policy-stable=false, otherwise we don't do anything for the current $\pi(s)$.
    • Another way of fixing is that, within policy evaluation, we can also check if the current state-value function is close to the state-value function in the last time policy evaluation. If there is a strict improvement in

3. Ex 4.5

\begin{align*} q_{\pi}(s, a) &= \mathbb{E}[R_{t+1}+\gamma*v_{\pi}(S_{t+1})|s, a] \\ &= \mathbb{E}[R_{t+1}+\gamma*\sum_{a}\pi(a|S_{t+1})q_{\pi}(S_{t+1}, a)|s, a] \\ &= \sum_{s', r}p(s',r|s,a)(r+\gamma*\sum_{a}\pi(a|s')q_{\pi}(s', a)) \end{align*}

At each time step $k$, we have $q_k(s,a)$. Then, we can use this $q_k(s,a)$ to evaluate $q_{k+1}(s,a)$ using the equation above. This is policy evaluation for the action-value function.

4. Ex 4.7

4.1. Replicate Fig. 4.2

  • The solution is the same with Fig. 4.2.
In [25]:
from itertools import product
import math
import time

from joblib import Parallel, delayed, parallel_backend

N_JOBS = 20

MV_COST = 2
RENT_RWD = 10
ARR1, ARR2 = 3, 4
RET1, RET2 = 3, 2
MAX_CARS = 20
MAX_CAR_MOVE = 5

POISSON_CACHE_ARR1 = [np.exp(-ARR1)*ARR1**arr1/math.factorial(arr1) for arr1 in range(MAX_CARS)]
POISSON_CACHE_CUMU_ARR1 = np.cumsum(POISSON_CACHE_ARR1)
POISSON_CACHE_ARR2 = [np.exp(-ARR2)*ARR2**arr2/math.factorial(arr2) for arr2 in range(MAX_CARS)]
POISSON_CACHE_CUMU_ARR2 = np.cumsum(POISSON_CACHE_ARR2)
POISSON_CACHE_RET1 = [np.exp(-RET1)*RET1**ret1/math.factorial(ret1) for ret1 in range(MAX_CARS)]
POISSON_CACHE_CUMU_RET1 = np.cumsum(POISSON_CACHE_RET1)
POISSON_CACHE_RET2 = [np.exp(-RET2)*RET2**ret2/math.factorial(ret2) for ret2 in range(MAX_CARS)]
POISSON_CACHE_CUMU_RET2 = np.cumsum(POISSON_CACHE_RET2)

# def car_rental_trans(c1, c2, mv, c1n, c2n):
#     """
#     Calculate the transition probability and expected reward for the car rental problem from 
#     state (c1, c2) to state (c1n, c2n) by moving mv cars from location 1 to location 2.
    
#     c1 is the number of cars at location 1.
#     c2 is the number of cars at location 2.
#     mv is the number of cars moved from location 1 to location 2.
#     c1n is the number of cars at location 1 at the end of the next day.
#     c2n is the number of cars at location 2 at the end of the next day.
#     """
#     if ((mv>0 and c1<mv) or (mv<0 and c2<-mv)):
#         return 0.0, 0.0
#     c1_be = min(MAX_CARS, c1-mv)
#     c2_be = min(MAX_CARS, c2+mv)
    
#     # c1n = min(MAX_CARS, c1_be-min(c1_be, arr1)+ret1)
#     r_sa_sp, p_sa_sp = 0, 0
#     for arr1, arr2 in product(range(c1_be+1), range(c2_be+1)):
#         ret1 = c1n-(c1_be-arr1)
#         ret2 = c2n-(c2_be-arr2)
#         if ret1<0 or ret2<0:
#             continue
#         p_arr1 = POISSON_CACHE_ARR1[arr1] if arr1<c1_be else 1-POISSON_CACHE_CUMU_ARR1[c1_be-1]
#         p_arr2 = POISSON_CACHE_ARR2[arr2] if arr2<c2_be else 1-POISSON_CACHE_CUMU_ARR2[c2_be-1]
#         p_ret1 = POISSON_CACHE_RET1[ret1] if c1n<MAX_CARS else 1-POISSON_CACHE_CUMU_RET1[ret1-1]
#         p_ret2 = POISSON_CACHE_RET2[ret2] if c2n<MAX_CARS else 1-POISSON_CACHE_CUMU_RET2[ret2-1]            
#         prob = p_arr1*p_arr2*p_ret1*p_ret2
#         r_sa_sp += RENT_RWD*(arr1+arr2)*prob
#         p_sa_sp += prob
#     r_sa_sp -= MV_COST*abs(mv)*p_sa_sp
#     return r_sa_sp, p_sa_sp

def car_rental_one_loc_trans(c_be, c_ed, loc):
    """
    Calculate the transition probability and expected reward for the car rental problem from 
    state c_be to state c_ed.
    
    c_be is the number of cars at the beginning of the day.
    c_ed is the number of cars at the end of the day.
    loc is the location of the cars.
    """
    r_sa_sp, p_sa_sp = 0, 0
    # arr>=c_be-c_ed
    arr_arr = np.arange(max(0, c_be-c_ed), c_be+1)
    pois_cache_arr = POISSON_CACHE_ARR1 if loc==1 else POISSON_CACHE_ARR2
    pois_cache_cum_arr = POISSON_CACHE_CUMU_ARR1 if loc==1 else POISSON_CACHE_CUMU_ARR2
    pois_cache_ret = POISSON_CACHE_RET1 if loc==1 else POISSON_CACHE_RET2
    pois_cache_cum_ret = POISSON_CACHE_CUMU_RET1 if loc==1 else POISSON_CACHE_CUMU_RET2
    # There can be a subtle bug here when c_be=0
    p_arr = np.array([    
        pois_cache_arr[arr] if arr<c_be else ((1-pois_cache_cum_arr[c_be-1]) if c_be>0 else 1)
        for arr in arr_arr
    ])
    # There can be a subtle bug here when ret=0
    def _ret_prob(arr):
        ret = c_ed-(c_be-arr)
        return pois_cache_ret[ret] if c_ed<MAX_CARS else ((1-pois_cache_cum_ret[ret-1]) if ret>0 else 1)
    p_ret = np.array([_ret_prob(arr) for arr in arr_arr])
    r_sa_sp = RENT_RWD*np.sum(arr_arr*p_arr*p_ret)
    p_sa_sp = np.sum(p_arr*p_ret)
    
    return r_sa_sp, p_sa_sp

def car_rental_trans1(c1, c2, mv, c1n, c2n):
    """
    Calculate the transition probability and expected reward for the car rental problem from 
    state (c1, c2) to state (c1n, c2n) by moving mv cars from location 1 to location 2.
    
    c1 is the number of cars at location 1.
    c2 is the number of cars at location 2.
    mv is the number of cars moved from location 1 to location 2.
    c1n is the number of cars at location 1 at the end of the next day.
    c2n is the number of cars at location 2 at the end of the next day.
    """
    if ((mv>0 and c1<mv) or (mv<0 and c2<-mv)):
        return 0.0, 0.0
    c1_be = min(MAX_CARS, c1-mv)
    c2_be = min(MAX_CARS, c2+mv)
    
    r_sa_sp1, p_sa_sp1 = car_rental_one_loc_trans(c1_be, c1n, loc=1)
    r_sa_sp2, p_sa_sp2 = car_rental_one_loc_trans(c2_be, c2n, loc=2)
    p_sa_sp = p_sa_sp1*p_sa_sp2
    # Be careful with the probability of the cost
    r_sa_sp = r_sa_sp1*p_sa_sp2+r_sa_sp2*p_sa_sp1-MV_COST*abs(mv)*p_sa_sp
    return r_sa_sp, p_sa_sp

def car_rental_pol_eval(
    car_rental_trans_func: callable,
    arr_pol: np.ndarray, arr_val: np.ndarray, ga: float, pol_iter: int,
    tol: 1e-4, max_iter: int, memo_step: int = 1, plt_trace=False, trace_ij=(0, 0)
):
    nr, nc = arr_pol.shape
    arr_valn = arr_val.copy()
    memo = {0: arr_valn[trace_ij]}
    m_diff, avg_diff, tic = 0, 0, time.time()
    # abs_tol = max(tol*RENT_RWD, 2**(-pol_iter))
    abs_tol = tol*RENT_RWD
    
    def _pol_eval_one_sweep(arr_val):
        def _one_update(i, j):
            arr_r_sa_sp, arr_p_sa_sp = np.zeros_like(arr_val), np.zeros_like(arr_val)
            for ni, nj in product(range(nr), range(nc)):
                r_sa_sp, p_sa_sp = car_rental_trans_func(i, j, arr_pol[i, j], ni, nj)
                arr_r_sa_sp[ni, nj], arr_p_sa_sp[ni, nj] = r_sa_sp, p_sa_sp
            return np.sum(arr_r_sa_sp+ga*arr_p_sa_sp*arr_val)
        with parallel_backend('loky', n_jobs=N_JOBS):
            res = Parallel(verbose=0, pre_dispatch="1.5*n_jobs")(
                delayed(_one_update)(i, j) for i, j in product(range(nr), range(nc))
            )
        arr_valn = np.array(res).reshape(nr, nc, order='C')
        return arr_valn
                
    for si in range(max_iter):
        if si%10==0:
            print(f'Policy evaluation iteration {si}: {m_diff:.2f}:{avg_diff:.2f} ({time.time()-tic:.2f}s)')
        m_diff, avg_diff, tic = 0, 0, time.time()
        # for i, j in product(range(nr), range(nc)):
        #     # print(i, j)
        #     # valn = 0
        #     # for mv, ni, nj in product(range(-MAX_CAR_MOVE, MAX_CAR_MOVE+1), range(nr), range(nc)):
        #     #     print(i, j, mv, ni, nj)
        #     #     r_sa_sp, p_sa_sp = car_rental_trans(i, j, mv, ni, nj)
        #     #     valn += r_sa_sp + ga*p_sa_sp*arr_valn[ni, nj]
        #     with parallel_backend('loky', n_jobs=N_JOBS):
        #         res = Parallel(verbose=0, pre_dispatch="1.5*n_jobs")(
        #             delayed(car_rental_trans)(i, j, arr_pol[i, j], ni, nj) 
        #             for ni, nj in product(range(nr), range(nc))
        #         )
        #     arr_r_sa_sp, arr_p_sa_sp = zip(*res)
        #     arr_r_sa_sp, arr_p_sa_sp = np.array(arr_r_sa_sp), np.array(arr_p_sa_sp)
        #     valn = np.sum(arr_r_sa_sp+ga*arr_p_sa_sp*arr_valn.reshape(-1, order='C'))
        #     m_diff = max(m_diff, abs(valn-arr_valn[i, j]))
        #     avg_diff += abs(valn-arr_valn[i, j])
        #     arr_valn[i, j] = valn
        arr_valn = _pol_eval_one_sweep(arr_val)
        m_diff = np.max(np.abs(arr_valn-arr_val))
        avg_diff = np.mean(np.abs(arr_valn-arr_val))
        arr_val = arr_valn
        if si%memo_step==0:
            memo[si] = arr_valn[trace_ij]
        if m_diff<abs_tol:
            print(f'Converged at iteration {si}')
            break
            
    if plt_trace:
        fig = make_subplots(rows=1, cols=1)
        arr_si = np.array(sorted(memo.keys()))
        vs = [memo[si] for si in arr_si]
        fig.add_trace(go.Scatter(x=arr_si+1, y=vs, mode='lines+markers'), row=1, col=1)
        fig.update_layout(
            xaxis=dict(title='Iteration', type='log'),
            title=f'Value Function at Iteration {trace_ij}', height=600, width=600
        )
        fig.show()
    return arr_valn

def eval_act_val(car_rental_trans_func, arr_val, i, j, mv, ga):
    nr, nc = arr_val.shape
    q_sa = 0
    trans_prob = 0
    for ni, nj in product(range(nr), range(nc)):
        r_sa_sp, p_sa_sp = car_rental_trans_func(i, j, mv, ni, nj)
        q_sa += r_sa_sp + ga*p_sa_sp*arr_val[ni, nj]
        trans_prob += p_sa_sp
    return q_sa

# def car_rental_pol_impr(arr_pol: np.ndarray, arr_val: np.ndarray, ga: float):
#     nr, nc = arr_pol.shape
#     arr_poln = arr_pol.copy()
#     pol_stab = True
#     for i, j in product(range(nr), range(nc)):
#         q_sa_memo = {}
#         a_max, q_max = arr_pol[i, j], -np.inf
#         for mv in range(-MAX_CAR_MOVE, MAX_CAR_MOVE+1):
#             q_sa_memo[mv] = eval_act_val(arr_val, i, j, mv, ga)
#             if q_sa_memo[mv]>q_max:
#                 q_max = q_sa_memo[mv]
#                 a_max = mv
        
#         if q_max>q_sa_memo[arr_pol[i, j]]:
#             arr_poln[i, j] = a_max
#             pol_stab = False
#     return pol_stab, arr_poln

def car_rental_pol_impr(car_rental_trans_func: callable, arr_pol: np.ndarray, arr_val: np.ndarray, ga: float):
    nr, nc = arr_pol.shape
    arr_poln = arr_pol.copy()
    
    def _one_update(i, j):
        q_sa_memo = {}
        a_max, q_max = arr_pol[i, j], -np.inf
        for mv in range(-MAX_CAR_MOVE, MAX_CAR_MOVE+1):
            q_sa_memo[mv] = eval_act_val(car_rental_trans_func, arr_val, i, j, mv, ga)
            if q_sa_memo[mv]>q_max:
                a_max, q_max = mv, q_sa_memo[mv]
        if q_max>arr_val[i, j]:
            return a_max
        else:
            return arr_pol[i, j]
    
    with parallel_backend('loky', n_jobs=N_JOBS):
        res = Parallel(verbose=0, pre_dispatch="1.5*n_jobs")(
            delayed(_one_update)(i, j) for i, j in product(range(nr), range(nc))
        )
    arr_poln = np.array(res).reshape(nr, nc, order='C')
    pol_stab = np.all(arr_poln==arr_pol)
    
    return pol_stab, arr_poln

def plot_arr_pol(arr_pol):
    
    fig = go.Figure(data=go.Heatmap(
        z=arr_pol,
        colorscale='aggrnyl',
        x=[i for i in range(MAX_CARS+1)],
        y=[j for j in range(MAX_CARS+1)],
        hoverongaps=False))
    fig.update_layout(
        title='Policy Heatmap',
        xaxis=dict(title='Number of Cars at Location 2', showgrid=True, gridwidth=1, gridcolor='black'),
        yaxis=dict(title='Number of Cars at Location 1', showgrid=True, gridwidth=1, gridcolor='black'),
        autosize=False,
        width=500,
        height=500,
    )
    fig.show()

def plot_arr_val(arr_val):
    fig = go.Figure(data=go.Heatmap(
        z=arr_val,
        colorscale='aggrnyl',
        x=[i for i in range(MAX_CARS+1)],
        y=[j for j in range(MAX_CARS+1)],
        hoverongaps=False))

    fig.update_layout(
        title='Heatmap of arr_val',
        xaxis=dict(title='Number of Cars at Location 2', showgrid=True, gridwidth=1, gridcolor='black'),
        yaxis=dict(title='Number of Cars at Location 1', showgrid=True, gridwidth=1, gridcolor='black'),
        autosize=False,
        width=500,
        height=500,
    )

    fig.show()

def car_rental_pol_iter(car_rental_trans_func, ga, max_pol_iter=-1, plt_iter=False, **pol_eval_kwargs):
    """
    pol_eval_kwargs = {'tol': 1e-4, 'max_iter': 1000, 'memo_step': 1, 'plt_trace': False, 'trace_ij': (0, 0)}
    """        
    arr_pol = np.zeros((MAX_CARS+1, MAX_CARS+1), dtype=int)
    arr_val = np.zeros((MAX_CARS+1, MAX_CARS+1), dtype=float)
    # for i, j in product(range(MAX_CARS+1), range(MAX_CARS+1)):
    #     arr_val[i, j] = min(i, ARR1)+min(j, ARR2)
    # arr_val *= RENT_RWD/(1-ga)
    
    scnt = 0
    while True:
        scnt += 1
        print("@"*10+f'Policy iteration {scnt}'+"@"*10)
        arr_valn = car_rental_pol_eval(car_rental_trans_func, arr_pol, arr_val, ga, scnt, **pol_eval_kwargs)
        pol_stab, arr_poln = car_rental_pol_impr(car_rental_trans_func, arr_pol, arr_valn, ga)
        if plt_iter:
            print("="*10+f'Iteration {scnt}'+"="*10)
            plot_arr_pol(arr_pol)
            plot_arr_val(arr_valn)
        if pol_stab or (max_pol_iter>0 and scnt>=max_pol_iter):
            print(f'Policy at iteration {scnt} (policy stable: {pol_stab})')
            arr_val, arr_pol = arr_valn, arr_poln
            break
        arr_val_diff = arr_valn-arr_val
        print(f'Policy at iteration {scnt} (policy stable: {pol_stab}) with min and max val improvement: {np.min(arr_val_diff):.2f}, {np.max(arr_val_diff):.2f}')
        arr_val, arr_pol = arr_valn, arr_poln
    
    return arr_pol, arr_val
In [23]:
arr_val = np.zeros((MAX_CARS+1, MAX_CARS+1), dtype=float)
for i, j in product(range(MAX_CARS+1), range(MAX_CARS+1)):
    arr_val[i, j] = min(i, ARR1)+min(j, ARR2)
arr_val *= RENT_RWD/(1-0.9)
plot_arr_val(arr_val)
In [24]:
pol_eval_kwargs = {'tol': 1e-3, 'max_iter': 1000, 'memo_step': 10, 'plt_trace': True, 'trace_ij': (10, 10)}
_ = car_rental_pol_iter(car_rental_trans1, 0.9, max_pol_iter=5, plt_iter=True, **pol_eval_kwargs)
@@@@@@@@@@Policy iteration 1@@@@@@@@@@
Policy evaluation iteration 0: 0.00:0.00 (0.00s)
Policy evaluation iteration 10: 21.79:19.31 (0.71s)
Policy evaluation iteration 20: 6.67:6.54 (0.71s)
Policy evaluation iteration 30: 2.30:2.28 (0.70s)
Policy evaluation iteration 40: 0.80:0.80 (0.73s)
Policy evaluation iteration 50: 0.28:0.28 (0.74s)
Policy evaluation iteration 60: 0.10:0.10 (0.74s)
Policy evaluation iteration 70: 0.03:0.03 (0.72s)
Policy evaluation iteration 80: 0.01:0.01 (0.72s)
Converged at iteration 81
==========Iteration 1==========
Policy at iteration 1 (policy stable: False) with min and max val improvement: 407.09, 611.32
@@@@@@@@@@Policy iteration 2@@@@@@@@@@
Policy evaluation iteration 0: 0.00:0.00 (0.00s)
Policy evaluation iteration 10: 0.51:0.47 (0.73s)
Policy evaluation iteration 20: 0.18:0.17 (0.73s)
Policy evaluation iteration 30: 0.06:0.06 (0.72s)
Policy evaluation iteration 40: 0.02:0.02 (0.71s)
Converged at iteration 47
==========Iteration 2==========
Policy at iteration 2 (policy stable: False) with min and max val improvement: 8.50, 75.14
@@@@@@@@@@Policy iteration 3@@@@@@@@@@
Policy evaluation iteration 0: 0.00:0.00 (0.00s)
Policy evaluation iteration 10: 0.48:0.24 (0.73s)
Policy evaluation iteration 20: 0.07:0.06 (0.74s)
Policy evaluation iteration 30: 0.02:0.02 (0.74s)
Converged at iteration 36
==========Iteration 3==========
Policy at iteration 3 (policy stable: False) with min and max val improvement: 2.96, 12.60
@@@@@@@@@@Policy iteration 4@@@@@@@@@@
Policy evaluation iteration 0: 0.00:0.00 (0.00s)
Policy evaluation iteration 10: 0.02:0.01 (0.73s)
Converged at iteration 13
==========Iteration 4==========
Policy at iteration 4 (policy stable: False) with min and max val improvement: 0.11, 0.82
@@@@@@@@@@Policy iteration 5@@@@@@@@@@
Policy evaluation iteration 0: 0.00:0.00 (0.00s)
Converged at iteration 1
==========Iteration 5==========
Policy at iteration 5 (policy stable: True)

4.2. Add more details

In [27]:
PARKING_THRE = 10
EXTRA_PARKING_COST = 4

def car_rental_trans2(c1, c2, mv, c1n, c2n):
    """
    Calculate the transition probability and expected reward for the car rental problem from 
    state (c1, c2) to state (c1n, c2n) by moving mv cars from location 1 to location 2.
    
    c1 is the number of cars at location 1.
    c2 is the number of cars at location 2.
    mv is the number of cars moved from location 1 to location 2.
    c1n is the number of cars at location 1 at the end of the next day.
    c2n is the number of cars at location 2 at the end of the next day.
    """
    if ((mv>0 and c1<mv) or (mv<0 and c2<-mv)):
        return 0.0, 0.0
    c1_be = min(MAX_CARS, c1-mv)
    c2_be = min(MAX_CARS, c2+mv)
    
    r_sa_sp1, p_sa_sp1 = car_rental_one_loc_trans(c1_be, c1n, loc=1)
    r_sa_sp2, p_sa_sp2 = car_rental_one_loc_trans(c2_be, c2n, loc=2)
    p_sa_sp = p_sa_sp1*p_sa_sp2
    # Be careful with the probability of the cost
    cost = MV_COST*abs(mv-int(mv>0))+EXTRA_PARKING_COST*(int(c1n>PARKING_THRE)+int(c2n>PARKING_THRE))
    r_sa_sp = r_sa_sp1*p_sa_sp2+r_sa_sp2*p_sa_sp1-cost*p_sa_sp
    return r_sa_sp, p_sa_sp
In [28]:
pol_eval_kwargs = {'tol': 1e-3, 'max_iter': 1000, 'memo_step': 10, 'plt_trace': True, 'trace_ij': (10, 10)}
_ = car_rental_pol_iter(car_rental_trans2, 0.9, max_pol_iter=5, plt_iter=True, **pol_eval_kwargs)
@@@@@@@@@@Policy iteration 1@@@@@@@@@@
Policy evaluation iteration 0: 0.00:0.00 (0.00s)
Policy evaluation iteration 10: 20.42:18.50 (0.74s)
Policy evaluation iteration 20: 6.29:6.25 (0.75s)
Policy evaluation iteration 30: 2.18:2.18 (0.74s)
Policy evaluation iteration 40: 0.76:0.76 (0.73s)
Policy evaluation iteration 50: 0.27:0.27 (0.75s)
Policy evaluation iteration 60: 0.09:0.09 (0.76s)
Policy evaluation iteration 70: 0.03:0.03 (0.76s)
Policy evaluation iteration 80: 0.01:0.01 (0.73s)
Converged at iteration 81
==========Iteration 1==========
Policy at iteration 1 (policy stable: False) with min and max val improvement: 399.49, 569.10
@@@@@@@@@@Policy iteration 2@@@@@@@@@@
Policy evaluation iteration 0: 0.00:0.00 (0.00s)
Policy evaluation iteration 10: 1.12:1.05 (0.74s)
Policy evaluation iteration 20: 0.39:0.39 (0.73s)
Policy evaluation iteration 30: 0.14:0.14 (0.75s)
Policy evaluation iteration 40: 0.05:0.05 (0.73s)
Policy evaluation iteration 50: 0.02:0.02 (0.74s)
Converged at iteration 54
==========Iteration 2==========
Policy at iteration 2 (policy stable: False) with min and max val improvement: 20.29, 110.18
@@@@@@@@@@Policy iteration 3@@@@@@@@@@
Policy evaluation iteration 0: 0.00:0.00 (0.00s)
Policy evaluation iteration 10: 0.63:0.31 (0.74s)
Policy evaluation iteration 20: 0.10:0.08 (0.74s)
Policy evaluation iteration 30: 0.03:0.03 (0.74s)
Converged at iteration 38
==========Iteration 3==========
Policy at iteration 3 (policy stable: False) with min and max val improvement: 4.10, 20.37
@@@@@@@@@@Policy iteration 4@@@@@@@@@@
Policy evaluation iteration 0: 0.00:0.00 (0.00s)
Policy evaluation iteration 10: 0.02:0.01 (0.73s)
Converged at iteration 13
==========Iteration 4==========
Policy at iteration 4 (policy stable: False) with min and max val improvement: 0.11, 1.07
@@@@@@@@@@Policy iteration 5@@@@@@@@@@
Policy evaluation iteration 0: 0.00:0.00 (0.00s)
Converged at iteration 1
==========Iteration 5==========
Policy at iteration 5 (policy stable: True)
In [ ]:
 
In [ ]:
 
In [ ]: